netlitThis vignette shows examples of assessing bias in literature review networks based on covariates from metadata about the studies and authors included or excluded from the review on redistricting in the main manuscript. Specifically, for each study, we collect metadata on the lead author’s gender, H-Index, and total number of citations. We then assess the impact of selecting studies on covariates in two ways:
First, we subset the network (e.g., to studies where the lead author is a man) and observe how many nodes and edges are missing in these subsets. This reveals the contributions of underrepresented scholars to the network by showing what we lose if they are excluded.
Second, we draw random samples of 100 studies weighted by covariates. This simulates a literature review that is biased (e.g., toward scholars who are men or have many citations). We then compare these biased samples to an unweighted random sample of studies in the network.
lit <- literature_long %>%
distinct(to, from) %>%
review()
lit## A netlit_review object with the following components:
##
## $edgelist
## - 69 edges
## - edge attributes: edge_betweenness
## $nodelist
## - 56 nodes
## - node attributes: degree_in, degree_out, degree_total, betweenness
## $graph
## an igraph object
# best seed 1,4, *5*
set.seed(5)
netlit_plot <- function(g){
ggraph(g, layout = 'fr') +
geom_node_point(
aes(color = degree_total %>% as.factor() ),
size = 6,
alpha = .7
) +
geom_edge_arc2(
start_cap = circle(3, 'mm'),
end_cap = circle(6, 'mm'),
aes(
color = edge_betweenness,
),
curvature = 0,
arrow = arrow(length = unit(2, 'mm'),
type = "open")
) +
geom_edge_loop(
start_cap = circle(5, 'mm'),
end_cap = circle(2, 'mm'),
aes( color = edge_betweenness),
n = 300,
strength = .6,
arrow = arrow(length = unit(2, 'mm'),
type = "open")
) +
geom_node_text( aes(label = name), size = 2.3) +
ggplot2::theme_void() +
theme(legend.position="bottom") +
labs(edge_color = "Edge Betweenness",
color = "Total Degree\nCentrality",
edge_linetype = "") +
scale_edge_color_viridis(option = "plasma",
begin = 0,
end = .9,
direction = -1,
guide = "legend") +
scale_color_viridis_d(option = "mako",
begin = 1,
end = .5)
}
g <- literature_long %>%
distinct(to, from) %>%
review() %>%
.$graph
g %>%
netlit_plot()# for plotting bias
netlit_bias_plot <- function(subgraph){
# lit with edge attribute indicating missing from subgraph
lit <- literature_long %>%
distinct(to, from) %>%
left_join( subgraph$edgelist %>% distinct(to, from) %>% mutate(missing_edges = "Not missing")
) %>%
mutate(missing_edges = replace_na(missing_edges, "Missing"))
lit %<>%
review(edge_attributes = names(lit))
# missing nodes
missing_nodes <- lit$nodelist$node[!lit$nodelist$node %in% subgraph$nodelist$node]
set.seed(5)
ggraph(lit$g, layout = 'fr') +
geom_node_point(
aes(color = ifelse(name %in% missing_nodes, "Missing", "Not Missing")),
size = 6,
alpha = .7
) +
geom_edge_arc2(
start_cap = circle(3, 'mm'),
end_cap = circle(6, 'mm'),
aes(
color = missing_edges,
),
curvature = 0,
arrow = arrow(length = unit(2, 'mm'),
type = "open")
) +
geom_edge_loop(
start_cap = circle(5, 'mm'),
end_cap = circle(2, 'mm'),
aes(color = missing_edges),
n = 300,
strength = .6,
arrow = arrow(length = unit(2, 'mm'),
type = "open")
) +
geom_node_text( aes(label = name), size = 2.3) +
ggplot2::theme_void() +
theme(legend.position="bottom") +
labs(edge_color = "",
color = "",
edge_linetype = "") +
scale_color_discrete() +
scale_edge_color_discrete()
}
literature_long %<>%
mutate(author_is_man = author_gender == "M")# biased sample weights
literature_long %<>%
mutate(unbiased = .5,
weight = case_when(
author_is_man ~ .6,
!author_is_man ~ .4,
TRUE~ .5
))
# a function to sample the network
sample_lit <- function(n, literature_long, prob){
# create an index for the sample
samp_idx <- sample(seq_len(nrow(literature_long)),
100, # 100 draws = number of studies to draw
prob=prob # with prob var provided
)
# subset sample to index
sample <- literature_long %>%
rowid_to_column() %>%
filter(rowid %in% samp_idx) %>%
distinct(to, from) %>%
review()
return(sample)
}n_samples <-1000There are 165 studies in the original literature review. We draw 100 of them—first at random, then weighted random samples. For each type of simulated bias we use 1000 draws.
random_samples <- map(1:n_samples, # 100 samples
sample_lit,
literature_long=literature_long,
prob = literature_long$unbiased)
samples <- random_samples
mean_edge_betw <- . %>% pull(edge_betweenness) %>% mean()
mean_node_betw <- . %>% pull(betweenness) %>% mean()
mean_node_degree <- . %>% pull(degree_total) %>% mean()
# make a table of the total number of nodes, edges, and the graph object for plotting
summarise_samples <- function(samples){
summary <- tibble(
#edge stats
edges = samples %>% map(1) %>% modify(nrow) %>% unlist(),
edge_between_mean = samples %>% map(1) %>% modify(mean_edge_betw) %>% unlist(),
# nodes stats
nodes = samples %>% map(2) %>% modify(nrow) %>% unlist(),
node_between_mean = samples %>% map(2) %>% modify(mean_node_betw) %>% unlist(),
node_degree_mean = samples %>% map(2) %>% modify(mean_node_degree) %>% unlist(),
#graph stats
communities = samples %>% map(3) %>% modify(cluster_walktrap) %>% modify(length) %>% unlist(),
diameter = samples %>% map(3) %>% modify(diameter) %>% unlist(),
graph = samples %>% map(3)
)
return(summary)
}
summary <- summarise_samples(samples)
random <- summary %>% mutate(
sample = "Random"
)
# map(random$graph, netlit_plot)
map(random_samples[1:10], netlit_bias_plot) Average nodes recovered: 43.8
Average node betweenness recovered: 2.9607115
Average edges recovered: 46.94
Average edge betweenness recovered: 5.1360215
Average node degree recovered: 2.1470984
Average communities recovered: 10.12
Average diameter recovered: 4.65
# biased samples
gender_samples <- map(1:n_samples, sample_lit,literature_long=literature_long, prob = literature_long$weight)
samples <- gender_samples
summary <- summarise_samples(samples)
gender <- summary %>% mutate(sample = "Gender bias favoring men")
# map(gender_samples[1:10], netlit_bias_plot)
map(gender_samples[1:10], netlit_bias_plot)Average nodes recovered: 44.25
Average node betweenness recovered: 2.9773211
Average edges recovered: 47.472
Average edge betweenness recovered: 5.1271756
Average node degree recovered: 2.1480552
Average communities recovered: 10.328
Average diameter recovered: 4.667
# biased sample weights
literature_long %<>%
mutate(weight = case_when(
author_is_man ~ 1,
!author_is_man ~ .3,
TRUE~ .5
))
# biased samples
gender_samples <- map(1:n_samples, sample_lit,literature_long=literature_long, prob = literature_long$weight)
samples <- gender_samples
summary <- summarise_samples(samples)
gender <- summary %>% mutate(
sample = "Gender bias favoring men"
)
#map(gender$graph, netlit_plot)
map(gender_samples[1:10], netlit_bias_plot)Average nodes recovered: 45.339
Average node betweenness recovered: 3.3160159
Average edges recovered: 48.951
Average edge betweenness recovered: 5.5354191
Average node degree recovered: 2.1615376
Average communities recovered: 10.722
Average diameter recovered: 4.837
# biased sample weights
literature_long %<>%
mutate(weight = case_when(
author_is_man ~ .3,
!author_is_man ~ 1,
TRUE~ .5
))
gender_samples2 <- samples <- map(1:n_samples, sample_lit,literature_long=literature_long, prob = literature_long$weight)
# biased samples
summary <- summarise_samples(samples)
gender2 <- summary %>% mutate(
sample = "Gender bias favoring women"
)
#map(gender$graph, netlit_plot)
map(gender_samples2[1:10], netlit_bias_plot)Average nodes recovered: 42.591
Average node betweenness recovered: 2.3101483
Average edges recovered: 44.627
Average edge betweenness recovered: 4.3232846
Average node degree recovered: 2.0983178
Average communities recovered: 9.96
Average diameter recovered: 4.249
(replacing NA HIndex with 0)
literature_long %<>%
mutate(author_h_index = replace_na(author_h_index, 0 ))
# biased samples
hindex_samples <- samples <- map(1:n_samples, sample_lit,literature_long=literature_long, prob = literature_long$weight)
summary <- summarise_samples(samples)
hindex <- summary %>% mutate(
sample = "H-Index bias"
)
#map(gender$graph, netlit_plot)
map(hindex_samples[1:10], netlit_bias_plot)Average nodes recovered: 42.591
Average node betweenness recovered: 2.3101483
Average edges recovered: 44.627
Average edge betweenness recovered: 4.3232846
Average node degree recovered: 2.0983178
Average communities recovered: 9.96
Average diameter recovered: 4.249
(replacing NA author citations with 0)
literature_long %<>%
mutate(author_citations = replace_na(author_citations, 0 ))
# gender-biased samples
citations_samples <- map(1:n_samples, sample_lit,literature_long=literature_long, prob = literature_long$author_citations)
samples <- citations_samples
summary <- summarise_samples(samples)
citations <- summary %>% mutate(
sample = "Citations bias"
)
# map(citations$graph, netlit_plot)
map(citations_samples[1:10], netlit_bias_plot) # %>% .[c(1:10)]Average nodes recovered: 46.811
Average node betweenness recovered: 3.754296
Average edges recovered: 51.905
Average edge betweenness recovered: 6.1469967
Average node degree recovered: 2.2184447
Average communities recovered: 10.823
Average diameter recovered: 4.638
s <- full_join(random, gender) %>%
full_join(gender2) %>%
full_join(hindex) %>%
full_join(citations)
round2 <- . %>% round(1)
s_table <- s %>% group_by(sample) %>%
select(where(is.numeric)) %>% summarise_all(mean) %>%
group_by(sample) %>%
mutate_all(round2) %>%
arrange(rev(sample))
color.me <- which(s_table$sample == "Random")
names(s_table) %<>% str_remove("_mean")
s_table %>%
kable(booktabs = T) %>%
kable_styling() | sample | edges | edge_between | nodes | node_between | node_degree | communities | diameter |
|---|---|---|---|---|---|---|---|
| Random | 46.9 | 5.1 | 43.8 | 3.0 | 2.1 | 10.1 | 4.7 |
| H-Index bias | 44.6 | 4.3 | 42.6 | 2.3 | 2.1 | 10.0 | 4.2 |
| Gender bias favoring women | 44.6 | 4.3 | 42.6 | 2.3 | 2.1 | 10.0 | 4.2 |
| Gender bias favoring men | 49.0 | 5.5 | 45.3 | 3.3 | 2.2 | 10.7 | 4.8 |
| Citations bias | 51.9 | 6.1 | 46.8 | 3.8 | 2.2 | 10.8 | 4.6 |
s %>%
ggplot() +
aes(x = nodes, fill = sample, color = sample) +
geom_density(alpha = .3) +
scale_color_viridis_d() +
scale_fill_viridis_d() +
theme_minimal() +
labs(color = "",
fill = "", y = "Density",
x = "Nodes Recovered (out of 56)") +
theme(axis.text.y = element_blank(),
panel.grid.major.y = element_blank(),
panel.grid.minor.y = element_blank())
s %>%
ggplot() +
aes(x = edges, fill = sample, color = sample) +
geom_density(alpha = .3) +
scale_color_viridis_d() +
scale_fill_viridis_d() +
theme_minimal() +
labs(color = "",
fill = "", y = "Density",
x = "Edges Recovered (out of 69)") +
theme(axis.text.y = element_blank(),
panel.grid.major.y = element_blank(),
panel.grid.minor.y = element_blank())
s %>%
ggplot() +
aes(x = edge_between_mean, fill = sample, color = sample) +
geom_density(alpha = .3) +
scale_color_viridis_d() +
scale_fill_viridis_d() +
theme_minimal() +
labs(color = "",
fill = "", y = "Density",
x = "Average Edge Betweenness") +
theme(axis.text.y = element_blank(),
panel.grid.major.y = element_blank(),
panel.grid.minor.y = element_blank())
s %>%
ggplot() +
aes(x = node_between_mean, fill = sample, color = sample) +
geom_density(alpha = .3) +
scale_color_viridis_d() +
scale_fill_viridis_d() +
theme_minimal() +
labs(color = "",
fill = "", y = "Density",
x = "Average Node Betweenness") +
theme(axis.text.y = element_blank(),
panel.grid.major.y = element_blank(),
panel.grid.minor.y = element_blank())
s %>%
ggplot() +
aes(x = node_degree_mean, fill = sample, color = sample) +
geom_density(alpha = .3) +
scale_color_viridis_d() +
scale_fill_viridis_d() +
theme_minimal() +
labs(color = "",
fill = "", y = "Density",
x = "Average Degree") +
theme(axis.text.y = element_blank(),
panel.grid.major.y = element_blank(),
panel.grid.minor.y = element_blank())
s %>%
ggplot() +
aes(x = communities, fill = sample, color = sample) +
geom_density(alpha = .3) +
scale_color_viridis_d() +
scale_fill_viridis_d() +
theme_minimal() +
labs(color = "",
fill = "", y = "Density",
x = "Communities") +
theme(axis.text.y = element_blank(),
panel.grid.major.y = element_blank(),
panel.grid.minor.y = element_blank())
s %>%
ggplot() +
aes(x = diameter, fill = sample, color = sample) +
geom_density(alpha = .3) +
scale_color_viridis_d() +
scale_fill_viridis_d() +
theme_minimal() +
labs(color = "",
fill = "", y = "Density",
x = "Diameter") +
theme(axis.text.y = element_blank(),
panel.grid.major.y = element_blank(),
panel.grid.minor.y = element_blank())